In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import csv
%run 'preprocessor.ipynb' #our own preprocessor functions
In [2]:
with open('data_w1w4.csv', 'r') as f:
reader = csv.reader(f)
data = list(reader)
matrix = obtain_data_matrix(data)
samples = len(matrix)
print("Number of samples: " + str(samples))
Y = matrix[:,[8]]
X = matrix[:,[9]]
S = matrix[:,[11]]
In [3]:
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(X, Y)
# Make predictions using the testing set
Y_pred = regr.predict(X)
In [4]:
fig = plt.figure(1, figsize=(10, 4))
plt.scatter([X], [Y], color='blue', edgecolor='k')
plt.plot(X, Y_pred, color='red', linewidth=1)
plt.xticks(())
plt.yticks(())
print('Coefficients: ', regr.coef_)
plt.show()
# The mean squared error
print("Mean squared error: %.2f"
% mean_squared_error(Y, Y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(Y, Y_pred))
In [5]:
from sklearn.utils import resample
bootstrap_resamples = 5000
intercepts = []
coefs = []
for k in range(bootstrap_resamples):
#resample population with replacement
samples_resampled = resample(X,Y,replace=True,n_samples=len(X))
## Fit model to resampled data
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(samples_resampled[0], samples_resampled[1])
coefs.append(regr.coef_[0][0])
intercepts.append(regr.intercept_[0])
In [6]:
alpha = 0.95
p_lower = ((1-alpha)/2.0) * 100
p_upper = (alpha + ((1-alpha)/2.0)) * 100
coefs_lower = np.percentile(coefs,p_lower)
coefs_upper = np.percentile(coefs,p_upper)
intercepts_lower = np.percentile(intercepts,p_lower)
intercepts_upper = np.percentile(intercepts,p_upper)
print('Coefs %.0f%% CI = %.5f - %.5f' % (alpha*100,coefs_lower,coefs_upper))
print('Intercepts %.0f%% CI = %.5f - %.5f' % (alpha*100,intercepts_lower,intercepts_upper))
In [7]:
plt.hist(coefs)
plt.xlabel('Coefficient X0')
plt.title('Frquency Distribution of Coefficient X0')
plt.show()
plt.hist(intercepts)
plt.xlabel('Intercept')
plt.title('Frquency Distribution of Intercepts')
plt.show()